Initialise the libs


In [119]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
import numpy as np

from math import ceil

Load the data


In [141]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

regressionDir = '/home/weenkus/workspace/Machine Learning - University of Washington/Regression/datasets/'

sales = pd.read_csv(regressionDir + 'kc_house_data.csv', dtype = dtype_dict)
sales = sales.sort(['sqft_living','price'])

# dtype_dict same as above
set_1 = pd.read_csv(regressionDir + 'wk3_kc_house_set_1_data.csv', dtype=dtype_dict)
set_2 = pd.read_csv(regressionDir + 'wk3_kc_house_set_2_data.csv', dtype=dtype_dict)
set_3 = pd.read_csv(regressionDir + 'wk3_kc_house_set_3_data.csv', dtype=dtype_dict)
set_4 = pd.read_csv(regressionDir + 'wk3_kc_house_set_4_data.csv', dtype=dtype_dict)

train_valid_shuffled = pd.read_csv(regressionDir + 'wk3_kc_house_train_valid_shuffled.csv', dtype=dtype_dict)
test = pd.read_csv(regressionDir + 'wk3_kc_house_test_data.csv', dtype=dtype_dict)
training =  pd.read_csv(regressionDir + 'wk3_kc_house_train_data.csv', dtype=dtype_dict)


/home/weenkus/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:6: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)

Data exploration


In [121]:
# Show plots in jupyter
%matplotlib inline

sales.head()


Out[121]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
19452 3980300371 20140926T000000 142000 0 0.00 290 20875 1 0 0 ... 1 290 0 1963 0 98024 47.5308 -121.888 1620 22850
15381 2856101479 20140701T000000 276000 1 0.75 370 1801 1 0 0 ... 5 370 0 1923 0 98117 47.6778 -122.389 1340 5000
860 1723049033 20140620T000000 245000 1 0.75 380 15000 1 0 0 ... 5 380 0 1963 0 98168 47.4810 -122.323 1170 15000
18379 1222029077 20141029T000000 265000 0 0.75 384 213444 1 0 0 ... 4 384 0 2003 0 98070 47.4177 -122.491 1920 224341
4868 6896300380 20141002T000000 228000 0 1.00 390 5900 1 0 0 ... 4 390 0 1953 0 98118 47.5260 -122.261 2170 6000

5 rows × 21 columns


In [122]:
sales['price'].head()


Out[122]:
19452    142000
15381    276000
860      245000
18379    265000
4868     228000
Name: price, dtype: float64

Helper functions


In [123]:
def polynomial_dataframe(feature, degree): # feature is pandas.Series type
    # assume that degree >= 1
    # initialize the dataframe:
    poly_dataframe = pd.DataFrame()
    # and set poly_dataframe['power_1'] equal to the passed feature
    poly_dataframe['power_1'] = feature

    # first check if degree > 1
    if degree > 1:
        # then loop over the remaining degrees:
        for power in range(2, degree+1):
            # first we'll give the column a name:
            name = 'power_' + str(power)
            # assign poly_dataframe[name] to be feature^power; use apply(*)
            poly_dataframe[name] = feature;
            poly_dataframe[name] = poly_dataframe[name].apply(lambda x: x**power)
    return poly_dataframe

Ridge regression model fitting


In [124]:
poly15_data = polynomial_dataframe(sales['sqft_living'], 15) # use equivalent of `polynomial_sframe`
print(poly15_data)


       power_1    power_2       power_3       power_4       power_5  \
19452      290      84100  2.438900e+07  7.072810e+09  2.051115e+12   
15381      370     136900  5.065300e+07  1.874161e+10  6.934396e+12   
860        380     144400  5.487200e+07  2.085136e+10  7.923517e+12   
18379      384     147456  5.662310e+07  2.174327e+10  8.349416e+12   
4868       390     152100  5.931900e+07  2.313441e+10  9.022420e+12   
21332      390     152100  5.931900e+07  2.313441e+10  9.022420e+12   
8623       410     168100  6.892100e+07  2.825761e+10  1.158562e+13   
11500      420     176400  7.408800e+07  3.111696e+10  1.306912e+13   
14466      420     176400  7.408800e+07  3.111696e+10  1.306912e+13   
465        430     184900  7.950700e+07  3.418801e+10  1.470084e+13   
1168       440     193600  8.518400e+07  3.748096e+10  1.649162e+13   
18052      460     211600  9.733600e+07  4.477456e+10  2.059630e+13   
12075      470     220900  1.038230e+08  4.879681e+10  2.293450e+13   
15248      470     220900  1.038230e+08  4.879681e+10  2.293450e+13   
17394      480     230400  1.105920e+08  5.308416e+10  2.548040e+13   
8133       480     230400  1.105920e+08  5.308416e+10  2.548040e+13   
4203       490     240100  1.176490e+08  5.764801e+10  2.824752e+13   
4651       500     250000  1.250000e+08  6.250000e+10  3.125000e+13   
2141       520     270400  1.406080e+08  7.311616e+10  3.802040e+13   
3223       520     270400  1.406080e+08  7.311616e+10  3.802040e+13   
18059      520     270400  1.406080e+08  7.311616e+10  3.802040e+13   
2930       520     270400  1.406080e+08  7.311616e+10  3.802040e+13   
6779       520     270400  1.406080e+08  7.311616e+10  3.802040e+13   
9641       520     270400  1.406080e+08  7.311616e+10  3.802040e+13   
12484      530     280900  1.488770e+08  7.890481e+10  4.181955e+13   
3582       530     280900  1.488770e+08  7.890481e+10  4.181955e+13   
7017       530     280900  1.488770e+08  7.890481e+10  4.181955e+13   
14450      540     291600  1.574640e+08  8.503056e+10  4.591650e+13   
12757      540     291600  1.574640e+08  8.503056e+10  4.591650e+13   
18508      550     302500  1.663750e+08  9.150625e+10  5.032844e+13   
...        ...        ...           ...           ...           ...   
12872     6980   48720400  3.400684e+11  2.373677e+15  1.656827e+19   
15482     7000   49000000  3.430000e+11  2.401000e+15  1.680700e+19   
7035      7050   49702500  3.504026e+11  2.470339e+15  1.741589e+19   
4149      7080   50126400  3.548949e+11  2.512656e+15  1.778960e+19   
7907      7100   50410000  3.579110e+11  2.541168e+15  1.804229e+19   
21050     7120   50694400  3.609441e+11  2.569922e+15  1.829785e+19   
6501      7220   52128400  3.763670e+11  2.717370e+15  1.961941e+19   
21506     7270   52852900  3.842406e+11  2.793429e+15  2.030823e+19   
18594     7320   53582400  3.922232e+11  2.871074e+15  2.101626e+19   
2713      7350   54022500  3.970654e+11  2.918431e+15  2.145046e+19   
1315      7390   54612100  4.035834e+11  2.982481e+15  2.204054e+19   
10373     7400   54760000  4.052240e+11  2.998658e+15  2.219007e+19   
11871     7420   55056400  4.085185e+11  3.031207e+15  2.249156e+19   
12370     7440   55353600  4.118308e+11  3.064021e+15  2.279632e+19   
4024      7480   55950400  4.185090e+11  3.130447e+15  2.341575e+19   
14032     7620   58064400  4.424507e+11  3.371475e+15  2.569064e+19   
18477     7710   59444100  4.583140e+11  3.533601e+15  2.724406e+19   
16773     7730   59752900  4.618899e+11  3.570409e+15  2.759926e+19   
19858     7850   61622500  4.837366e+11  3.797333e+15  2.980906e+19   
13411     7880   62094400  4.893039e+11  3.855715e+15  3.038303e+19   
1448      8000   64000000  5.120000e+11  4.096000e+15  3.276800e+19   
1164      8010   64160100  5.139224e+11  4.116518e+15  3.297331e+19   
18302     8020   64320400  5.158496e+11  4.137114e+15  3.317965e+19   
14556     8670   75168900  6.517144e+11  5.650364e+15  4.898865e+19   
4411      9200   84640000  7.786880e+11  7.163930e+15  6.590815e+19   
8092      9640   92929600  8.958413e+11  8.635911e+15  8.325018e+19   
9254      9890   97812100  9.673617e+11  9.567207e+15  9.461968e+19   
3914     10040  100801600  1.012048e+12  1.016096e+16  1.020161e+20   
7252     12050  145202500  1.749690e+12  2.108377e+16  2.540594e+20   
12777    13540  183331600  2.482310e+12  3.361048e+16  4.550858e+20   

            power_6       power_7       power_8       power_9      power_10  \
19452  5.948233e+14  1.724988e+17  5.002464e+19  1.450715e+22  4.207072e+24   
15381  2.565726e+15  9.493188e+17  3.512479e+20  1.299617e+23  4.808584e+25   
860    3.010936e+15  1.144156e+18  4.347792e+20  1.652161e+23  6.278212e+25   
18379  3.206176e+15  1.231172e+18  4.727699e+20  1.815436e+23  6.971275e+25   
4868   3.518744e+15  1.372310e+18  5.352009e+20  2.087284e+23  8.140406e+25   
21332  3.518744e+15  1.372310e+18  5.352009e+20  2.087284e+23  8.140406e+25   
8623   4.750104e+15  1.947543e+18  7.984925e+20  3.273819e+23  1.342266e+26   
11500  5.489032e+15  2.305393e+18  9.682652e+20  4.066714e+23  1.708020e+26   
14466  5.489032e+15  2.305393e+18  9.682652e+20  4.066714e+23  1.708020e+26   
465    6.321363e+15  2.718186e+18  1.168820e+21  5.025926e+23  2.161148e+26   
1168   7.256314e+15  3.192778e+18  1.404822e+21  6.181218e+23  2.719736e+26   
18052  9.474297e+15  4.358177e+18  2.004761e+21  9.221902e+23  4.242075e+26   
12075  1.077922e+16  5.066231e+18  2.381129e+21  1.119130e+24  5.259913e+26   
15248  1.077922e+16  5.066231e+18  2.381129e+21  1.119130e+24  5.259913e+26   
17394  1.223059e+16  5.870683e+18  2.817928e+21  1.352605e+24  6.492506e+26   
8133   1.223059e+16  5.870683e+18  2.817928e+21  1.352605e+24  6.492506e+26   
4203   1.384129e+16  6.782231e+18  3.323293e+21  1.628414e+24  7.979227e+26   
4651   1.562500e+16  7.812500e+18  3.906250e+21  1.953125e+24  9.765625e+26   
2141   1.977061e+16  1.028072e+19  5.345973e+21  2.779906e+24  1.445551e+27   
3223   1.977061e+16  1.028072e+19  5.345973e+21  2.779906e+24  1.445551e+27   
18059  1.977061e+16  1.028072e+19  5.345973e+21  2.779906e+24  1.445551e+27   
2930   1.977061e+16  1.028072e+19  5.345973e+21  2.779906e+24  1.445551e+27   
6779   1.977061e+16  1.028072e+19  5.345973e+21  2.779906e+24  1.445551e+27   
9641   1.977061e+16  1.028072e+19  5.345973e+21  2.779906e+24  1.445551e+27   
12484  2.216436e+16  1.174711e+19  6.225969e+21  3.299764e+24  1.748875e+27   
3582   2.216436e+16  1.174711e+19  6.225969e+21  3.299764e+24  1.748875e+27   
7017   2.216436e+16  1.174711e+19  6.225969e+21  3.299764e+24  1.748875e+27   
14450  2.479491e+16  1.338925e+19  7.230196e+21  3.904306e+24  2.108325e+27   
12757  2.479491e+16  1.338925e+19  7.230196e+21  3.904306e+24  2.108325e+27   
18508  2.768064e+16  1.522435e+19  8.373394e+21  4.605367e+24  2.532952e+27   
...             ...           ...           ...           ...           ...   
12872  1.156465e+23  8.072126e+26  5.634344e+30  3.932772e+34  2.745075e+38   
15482  1.176490e+23  8.235430e+26  5.764801e+30  4.035361e+34  2.824752e+38   
7035   1.227820e+23  8.656131e+26  6.102572e+30  4.302313e+34  3.033131e+38   
4149   1.259504e+23  8.917288e+26  6.313440e+30  4.469916e+34  3.164700e+38   
7907   1.281003e+23  9.095120e+26  6.457535e+30  4.584850e+34  3.255244e+38   
21050  1.302807e+23  9.275983e+26  6.604500e+30  4.702404e+34  3.348112e+38   
6501   1.416522e+23  1.022729e+27  7.384100e+30  5.331320e+34  3.849213e+38   
21506  1.476408e+23  1.073349e+27  7.803246e+30  5.672960e+34  4.124242e+38   
18594  1.538390e+23  1.126102e+27  8.243064e+30  6.033923e+34  4.416831e+38   
2713   1.576609e+23  1.158808e+27  8.517237e+30  6.260169e+34  4.601224e+38   
1315   1.628796e+23  1.203680e+27  8.895196e+30  6.573550e+34  4.857853e+38   
10373  1.642065e+23  1.215128e+27  8.991947e+30  6.654041e+34  4.923990e+38   
11871  1.668874e+23  1.238304e+27  9.188217e+30  6.817657e+34  5.058701e+38   
12370  1.696046e+23  1.261858e+27  9.388225e+30  6.984839e+34  5.196720e+38   
4024   1.751498e+23  1.310120e+27  9.799700e+30  7.330176e+34  5.482971e+38   
14032  1.957626e+23  1.491711e+27  1.136684e+31  8.661533e+34  6.600088e+38   
18477  2.100517e+23  1.619499e+27  1.248634e+31  9.626965e+34  7.422390e+38   
16773  2.133423e+23  1.649136e+27  1.274782e+31  9.854066e+34  7.617193e+38   
19858  2.340011e+23  1.836909e+27  1.441973e+31  1.131949e+35  8.885801e+38   
13411  2.394183e+23  1.886616e+27  1.486653e+31  1.171483e+35  9.231285e+38   
1448   2.621440e+23  2.097152e+27  1.677722e+31  1.342177e+35  1.073742e+39   
1164   2.641162e+23  2.115571e+27  1.694572e+31  1.357352e+35  1.087239e+39   
18302  2.661008e+23  2.134129e+27  1.711571e+31  1.372680e+35  1.100889e+39   
14556  4.247316e+23  3.682423e+27  3.192661e+31  2.768037e+35  2.399888e+39   
4411   6.063550e+23  5.578466e+27  5.132189e+31  4.721614e+35  4.343885e+39   
8092   8.025317e+23  7.736406e+27  7.457895e+31  7.189411e+35  6.930592e+39   
9254   9.357886e+23  9.254949e+27  9.153145e+31  9.052460e+35  8.952883e+39   
3914   1.024241e+24  1.028338e+28  1.032452e+32  1.036581e+36  1.040728e+40   
7252   3.061416e+24  3.689006e+28  4.445252e+32  5.356529e+36  6.454617e+40   
12777  6.161862e+24  8.343162e+28  1.129664e+33  1.529565e+37  2.071031e+41   

           power_11      power_12      power_13      power_14      power_15  
19452  1.220051e+27  3.538148e+29  1.026063e+32  2.975582e+34  8.629189e+36  
15381  1.779176e+28  6.582952e+30  2.435692e+33  9.012061e+35  3.334463e+38  
860    2.385721e+28  9.065738e+30  3.444980e+33  1.309093e+36  4.974552e+38  
18379  2.676970e+28  1.027956e+31  3.947353e+33  1.515783e+36  5.820608e+38  
4868   3.174758e+28  1.238156e+31  4.828807e+33  1.883235e+36  7.344616e+38  
21332  3.174758e+28  1.238156e+31  4.828807e+33  1.883235e+36  7.344616e+38  
8623   5.503290e+28  2.256349e+31  9.251031e+33  3.792923e+36  1.555098e+39  
11500  7.173683e+28  3.012947e+31  1.265438e+34  5.314838e+36  2.232232e+39  
14466  7.173683e+28  3.012947e+31  1.265438e+34  5.314838e+36  2.232232e+39  
465    9.292937e+28  3.995963e+31  1.718264e+34  7.388536e+36  3.177070e+39  
1168   1.196684e+29  5.265409e+31  2.316780e+34  1.019383e+37  4.485286e+39  
18052  1.951354e+29  8.976230e+31  4.129066e+34  1.899370e+37  8.737103e+39  
12075  2.472159e+29  1.161915e+32  5.461000e+34  2.566670e+37  1.206335e+40  
15248  2.472159e+29  1.161915e+32  5.461000e+34  2.566670e+37  1.206335e+40  
17394  3.116403e+29  1.495873e+32  7.180192e+34  3.446492e+37  1.654316e+40  
8133   3.116403e+29  1.495873e+32  7.180192e+34  3.446492e+37  1.654316e+40  
4203   3.909821e+29  1.915812e+32  9.387480e+34  4.599865e+37  2.253934e+40  
4651   4.882812e+29  2.441406e+32  1.220703e+35  6.103516e+37  3.051758e+40  
2141   7.516866e+29  3.908770e+32  2.032560e+35  1.056931e+38  5.496043e+40  
3223   7.516866e+29  3.908770e+32  2.032560e+35  1.056931e+38  5.496043e+40  
18059  7.516866e+29  3.908770e+32  2.032560e+35  1.056931e+38  5.496043e+40  
2930   7.516866e+29  3.908770e+32  2.032560e+35  1.056931e+38  5.496043e+40  
6779   7.516866e+29  3.908770e+32  2.032560e+35  1.056931e+38  5.496043e+40  
9641   7.516866e+29  3.908770e+32  2.032560e+35  1.056931e+38  5.496043e+40  
12484  9.269036e+29  4.912589e+32  2.603672e+35  1.379946e+38  7.313715e+40  
3582   9.269036e+29  4.912589e+32  2.603672e+35  1.379946e+38  7.313715e+40  
7017   9.269036e+29  4.912589e+32  2.603672e+35  1.379946e+38  7.313715e+40  
14450  1.138496e+30  6.147876e+32  3.319853e+35  1.792721e+38  9.680692e+40  
12757  1.138496e+30  6.147876e+32  3.319853e+35  1.792721e+38  9.680692e+40  
18508  1.393123e+30  7.662179e+32  4.214198e+35  2.317809e+38  1.274795e+41  
...             ...           ...           ...           ...           ...  
12872  1.916062e+42  1.337412e+46  9.335133e+49  6.515923e+53  4.548114e+57  
15482  1.977327e+42  1.384129e+46  9.688901e+49  6.782231e+53  4.747562e+57  
7035   2.138357e+42  1.507542e+46  1.062817e+50  7.492860e+53  5.282467e+57  
4149   2.240608e+42  1.586350e+46  1.123136e+50  7.951803e+53  5.629876e+57  
7907   2.311223e+42  1.640968e+46  1.165087e+50  8.272121e+53  5.873206e+57  
21050  2.383856e+42  1.697305e+46  1.208481e+50  8.604387e+53  6.126323e+57  
6501   2.779132e+42  2.006533e+46  1.448717e+50  1.045974e+54  7.551930e+57  
21506  2.998324e+42  2.179781e+46  1.584701e+50  1.152078e+54  8.375605e+57  
18594  3.233121e+42  2.366644e+46  1.732384e+50  1.268105e+54  9.282527e+57  
2713   3.381900e+42  2.485696e+46  1.826987e+50  1.342835e+54  9.869839e+57  
1315   3.589953e+42  2.652976e+46  1.960549e+50  1.448846e+54  1.070697e+58  
10373  3.643753e+42  2.696377e+46  1.995319e+50  1.476536e+54  1.092637e+58  
11871  3.753557e+42  2.785139e+46  2.066573e+50  1.533397e+54  1.137781e+58  
12370  3.866360e+42  2.876572e+46  2.140169e+50  1.592286e+54  1.184661e+58  
4024   4.101263e+42  3.067744e+46  2.294673e+50  1.716415e+54  1.283879e+58  
14032  5.029267e+42  3.832301e+46  2.920214e+50  2.225203e+54  1.695605e+58  
18477  5.722663e+42  4.412173e+46  3.401785e+50  2.622777e+54  2.022161e+58  
16773  5.888090e+42  4.551494e+46  3.518304e+50  2.719649e+54  2.102289e+58  
19858  6.975354e+42  5.475653e+46  4.298387e+50  3.374234e+54  2.648774e+58  
13411  7.274253e+42  5.732111e+46  4.516904e+50  3.559320e+54  2.804744e+58  
1448   8.589935e+42  6.871948e+46  5.497558e+50  4.398047e+54  3.518437e+58  
1164   8.708787e+42  6.975739e+46  5.587567e+50  4.475641e+54  3.584988e+58  
18302  8.829133e+42  7.080965e+46  5.678934e+50  4.554505e+54  3.652713e+58  
14556  2.080703e+43  1.803969e+47  1.564041e+51  1.356024e+55  1.175673e+59  
4411   3.996374e+43  3.676664e+47  3.382531e+51  3.111928e+55  2.862974e+59  
8092   6.681091e+43  6.440572e+47  6.208711e+51  5.985197e+55  5.769730e+59  
9254   8.854401e+43  8.757003e+47  8.660676e+51  8.565409e+55  8.471189e+59  
3914   1.044891e+44  1.049070e+48  1.053266e+52  1.057480e+56  1.061709e+60  
7252   7.777813e+44  9.372265e+48  1.129358e+53  1.360876e+57  1.639856e+61  
12777  2.804176e+45  3.796855e+49  5.140941e+53  6.960834e+57  9.424970e+61  

[21613 rows x 15 columns]

In [125]:
l2_small_penalty = 1.5e-5
model = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model.fit(poly15_data, sales['price'])


Out[125]:
Ridge(alpha=1.5e-05, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [126]:
model.coef_


Out[126]:
array([  1.24873306e+02,  -4.77376011e-02,   3.01446238e-05,
        -2.44419942e-09,  -1.94153675e-13,   8.54085686e-18,
         1.51142121e-21,   8.27979094e-26,   6.52603100e-31,
        -3.27895017e-34,  -3.87962315e-38,  -2.72437650e-42,
        -1.07790800e-46,   3.78242694e-51,   1.39790296e-54])

In [127]:
plt.plot(poly15_data, model.predict(poly15_data), poly15_data, sales['price'])
plt.show()


Ridge regression on subsets

Using ridge regression with small l2


In [128]:
l2_small_penalty=1e-9

poly15_data_set1 = polynomial_dataframe(set_1['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model1 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model1.fit(poly15_data_set1, set_1['price'])

poly15_data_set2 = polynomial_dataframe(set_2['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model2 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model2.fit(poly15_data_set2, set_2['price'])

poly15_data_set3 = polynomial_dataframe(set_3['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model3 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model3.fit(poly15_data_set3, set_3['price'])

poly15_data_set4 = polynomial_dataframe(set_4['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model4 = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)
model4.fit(poly15_data_set4, set_4['price'])


Out[128]:
Ridge(alpha=1e-09, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [129]:
plt.plot(poly15_data_set1, model1.predict(poly15_data_set1), poly15_data_set1, set_1['price'])
plt.show()

plt.plot(poly15_data_set2, model2.predict(poly15_data_set2), poly15_data_set2, set_2['price'])
plt.show()

plt.plot(poly15_data_set3, model3.predict(poly15_data_set3), poly15_data_set3, set_3['price'])
plt.show()

plt.plot(poly15_data_set4, model4.predict(poly15_data_set4), poly15_data_set4, set_4['price'])
plt.show()



In [130]:
print('Model 1 coefficients: ', model1.coef_)
print('Model 2 coefficients: ', model2.coef_)
print('Model 3 coefficients: ', model3.coef_)
print('Model 4 coefficients: ', model4.coef_)


Model 1 coefficients:  [  5.44669376e+02  -3.55447580e-01   1.22446368e-04  -1.17175278e-08
  -3.90512972e-13  -1.39075896e-17   1.47860259e-20   6.87492376e-25
  -7.57204175e-29  -1.04097336e-32  -3.71843943e-37   3.39989317e-41
   5.56591999e-45   2.53761435e-49  -3.35152943e-53]
Model 2 coefficients:  [  8.59362612e+02  -8.18118183e-01   4.28879879e-04  -9.12770077e-08
  -2.69606133e-12   3.73980536e-15  -1.42711908e-19  -6.30794906e-23
  -1.44559687e-27   7.44321610e-31   9.25866075e-35   3.27974536e-41
  -1.29543502e-42  -1.38781261e-46   1.66546452e-50]
Model 3 coefficients:  [ -7.55395916e+02   9.75579484e-01  -4.58945974e-04   7.77958020e-08
   7.15013519e-12  -2.88601996e-15  -2.13677987e-20   3.38085238e-23
   2.19178142e-27  -1.97067733e-31  -4.15993090e-35  -1.80196317e-39
   3.19071186e-43   5.08456981e-47  -3.93304294e-51]
Model 4 coefficients:  [  1.11944572e+03  -9.83760236e-01   3.38770920e-04   3.60377089e-08
  -4.37813981e-11   5.77191626e-15   7.66795302e-19  -9.49297780e-23
  -1.96030805e-26  -2.10881952e-32   3.31005108e-34   3.47733782e-38
  -2.43039194e-42  -8.79553285e-46   6.44569669e-50]

Applying a higher L2 value


In [131]:
l2_large_penalty=1.23e2

poly15_data_set1 = polynomial_dataframe(set_1['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model1 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model1.fit(poly15_data_set1, set_1['price'])

poly15_data_set2 = polynomial_dataframe(set_2['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model2 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model2.fit(poly15_data_set2, set_2['price'])

poly15_data_set3 = polynomial_dataframe(set_3['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model3 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model3.fit(poly15_data_set3, set_3['price'])

poly15_data_set4 = polynomial_dataframe(set_4['sqft_living'], 15) # use equivalent of `polynomial_sframe`
model4 = linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
model4.fit(poly15_data_set4, set_4['price'])


Out[131]:
Ridge(alpha=123.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [132]:
plt.plot(poly15_data_set1, model1.predict(poly15_data_set1), poly15_data_set1, set_1['price'])
plt.show()

plt.plot(poly15_data_set2, model2.predict(poly15_data_set2), poly15_data_set2, set_2['price'])
plt.show()

plt.plot(poly15_data_set3, model3.predict(poly15_data_set3), poly15_data_set3, set_3['price'])
plt.show()

plt.plot(poly15_data_set4, model4.predict(poly15_data_set4), poly15_data_set4, set_4['price'])
plt.show()



In [133]:
print('Model 1 coefficients: ', model1.coef_)
print('Model 2 coefficients: ', model2.coef_)
print('Model 3 coefficients: ', model3.coef_)
print('Model 4 coefficients: ', model4.coef_)


Model 1 coefficients:  [  2.32806803e+00   3.53621608e-04   3.31969692e-08   2.00082477e-12
   1.11492559e-16   6.57786122e-21   4.12939525e-25   2.70393755e-29
   1.81614763e-33   1.23824277e-37   8.51872481e-42   5.89455598e-46
   4.09542560e-50   2.85464889e-54   1.99547476e-58]
Model 2 coefficients:  [  2.09756903e+00   3.90817483e-04   6.67189944e-08   8.90002997e-12
   9.72639877e-16   9.69733682e-20   9.50564475e-24   9.44491031e-28
   9.57191338e-32   9.86945155e-36   1.03101115e-39   1.08729784e-43
   1.15453748e-47   1.23211305e-51   1.31986696e-55]
Model 3 coefficients:  [  2.28906258e+00   4.12472190e-04   6.08835345e-08   6.58572163e-12
   6.15278155e-16   5.64446634e-20   5.28834396e-24   5.07091402e-28
   4.94657273e-32   4.88043809e-36   4.85009106e-40   4.84161534e-44
   4.84635021e-48   4.85883628e-52   4.87558469e-56]
Model 4 coefficients:  [  2.08596194e+00   4.05035772e-04   7.46864647e-08   1.13096608e-11
   1.45864442e-15   1.73561251e-19   2.01609632e-23   2.34605255e-27
   2.75636073e-31   3.27043069e-35   3.91046855e-39   4.70118041e-43
   5.67212304e-47   6.85958087e-51   8.30843630e-55]

Selecting an L2 penalty via cross-validation

Just like the polynomial degree, the L2 penalty is a "magic" parameter we need to select. We could use the validation set approach as we did in the last module, but that approach has a major disadvantage: it leaves fewer observations available for training. Cross-validation seeks to overcome this issue by using all of the training set in a smart way.

We will implement a kind of cross-validation called k-fold cross-validation. The method gets its name because it involves dividing the training set into k segments of roughtly equal size. Similar to the validation set method, we measure the validation error with one of the segments designated as the validation set. The major difference is that we repeat the process k times as follows:

Set aside segment 0 as the validation set, and fit a model on rest of data, and evalutate it on this validation set

Set aside segment 1 as the validation set, and fit a model on rest of data, and evalutate it on this validation set

...

Set aside segment k-1 as the validation set, and fit a model on rest of data, and evalutate it on this validation set

After this process, we compute the average of the k validation errors, and use it as an estimate of the generalization error. Notice that all observations are used for both training and validation, as we iterate over segments of data.


In [134]:
def k_fold_cross_validation(k, l2_penalty, data, output):
    n = len(data)
    sumRSS = 0
    
    for i in range(k):
        # Get the validation/training interval
        start = (n*i)/k
        end = (n*(i+1))/k-1
        #print (i, (ceil(start), ceil(end)))
        train_valid_shuffled[0:ceil(start)].append(train_valid_shuffled[ceil(end)+1:n])   
        
        # Train the model
        model = linear_model.Ridge(alpha=l2_penalty, normalize=True)
        model.fit(data, output)
    
        # Calculate RSS
        RSS = (abs(output - model.predict(data)) ** 2).sum()
        
        # Add the RSS to the sum for computing the average
        sumRSS += RSS
    return (sumRSS / k)

In [135]:
print (k_fold_cross_validation(10, 1e-9, poly15_data_set2, set_2['price']))


296862792315373.44

Minimize the l2 by using cross validation


In [136]:
import sys

l2s = np.logspace(3, 9, num=13)
train_valid_shuffled_poly15 = polynomial_dataframe(train_valid_shuffled['sqft_living'], 15)
k = 10

minError = sys.maxsize
for l2 in l2s:
    avgError = k_fold_cross_validation(k, l2, train_valid_shuffled_poly15, train_valid_shuffled['price'])
    print ('For l2:', l2, ' the CV is ', avgError)
    if avgError < minError:
        minError = avgError 
        bestl2 = l2
        
print (minError)
print (bestl2)


For l2: 1000.0  the CV is  2650520195070680.0
For l2: 3162.27766017  the CV is  2657012027282295.0
For l2: 10000.0  the CV is  2659080208536254.5
For l2: 31622.7766017  the CV is  2659735771406552.5
For l2: 100000.0  the CV is  2659943233824360.5
For l2: 316227.766017  the CV is  2660008854743339.0
For l2: 1000000.0  the CV is  2660029607454739.5
For l2: 3162277.66017  the CV is  2660036170193829.0
For l2: 10000000.0  the CV is  2660038245529684.5
For l2: 31622776.6017  the CV is  2660038901810067.5
For l2: 100000000.0  the CV is  2660039109344276.0
For l2: 316227766.017  the CV is  2660039174972390.5
For l2: 1000000000.0  the CV is  2660039195725831.0
2650520195070680.0
1000.0

Use the best l2 to train the model on all the data


In [161]:
model = linear_model.Ridge(alpha=1000, normalize=True)
model.fit(training[['sqft_living']], training['price'])


Out[161]:
Ridge(alpha=1000, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [163]:
print("Residual sum of squares: %.2f"
  % ((model.predict(test[['sqft_living']]) - test['price']) ** 2).sum())


Residual sum of squares: 284428436980723.12

In [ ]:


In [ ]: